Contents

1 Required libraries and data

1.1 Load needed libraries

library(tidyverse)
library(GGally)
library(UpSetR)
library(pheatmap)
library(cowplot)

1.2 Load number of assigned cells

data = 'Replogle_K562'

# Varying UMI threshold
n_assigned_cells_UMI <- read_csv(paste0(data_dir, 'guide_calling/', data, '/UMI/', 
                                    'all_assignments.csv'), show_col_types = FALSE) 
n_assigned_cells_UMI_single <- read_csv(paste0(data_dir, 'guide_calling/', data, '/UMI/', 
                                    'single_assignments.csv'), show_col_types = FALSE) 

# Varying ratio threshold
n_assigned_cells_ratios <- read_csv(paste0(data_dir, 'guide_calling/', data, '/ratios/', 
                                    'all_assignments.csv'), show_col_types = FALSE) 
n_assigned_cells_ratios_single <- read_csv(paste0(data_dir, 'guide_calling/', data, '/ratios/', 
                                    'single_assignments.csv'), show_col_types = FALSE) 

# Varying quantile threshold
n_assigned_cells_q <- read_csv(paste0(data_dir, 'guide_calling/', data, '/quantiles/', 
                                    'all_assignments.csv'), show_col_types = FALSE) 
n_assigned_cells_q_single <- read_csv(paste0(data_dir, 'guide_calling/', data, '/quantiles/', 
                                    'single_assignments.csv'), show_col_types = FALSE) 
# add vertical lines for chosen thresholds
UMI_t = 15
ratio_t = 30
quantile_t = 5

all_t <- data.frame(assignment = c('UMI_t', 'ratio_X%', 'top_X%_cells'),
                    threshold = c(UMI_t, ratio_t, quantile_t))

1.3 Load power check results for various guide calling approaches

get_power_check_results <- function(data, name, method_names){
  method_name = method_names[method_names$dir_name == name, 'method']
  df = readRDS(paste0(data_dir, 'sceptre_pipeline/', data, '/power_check/', name, 
                                '/results_run_power_check.rds')) %>%
    filter(pass_qc == TRUE) %>%
    mutate(method = method_name)
  df <- df %>% mutate(p_adj = p.adjust(df$p_value, method="BH"))
  return(df)
}
# Varying UMI threshold
dir_names_UMI = c("UMI/t2", "UMI/t5", "UMI/t10", "UMI/t15", "UMI/t20", 
              "UMI/t30", "UMI/t40", "UMI/t50", "UMI/t60", "UMI/t80", 
              "UMI/t100", "UMI/t120", "UMI/t140", "UMI/t160", "UMI/t180", 
              "UMI/t200")
method_names_UMI <- data.frame('method' = c(2,5,10,15,20,30,40,50,60,80,100,120,140,160,180,200), 
                           'dir_name' = dir_names_UMI)
power_check_UMI <- lapply(dir_names_UMI, function(name) {get_power_check_results(data, name, method_names_UMI)}) %>% 
  bind_rows()

# Varying ratio threshold
dir_names_ratio = c("ratios/t0.1", "ratios/t0.2", "ratios/t0.3", "ratios/t0.4", "ratios/t0.5", "ratios/t0.6", 
              "ratios/t0.7", "ratios/t0.8", "ratios/t0.9")
method_names_ratio <- data.frame('method' = c(10, 20, 30, 40, 50, 60, 70, 80, 90), 
                           'dir_name' = dir_names_ratio)
power_check_ratios <- lapply(dir_names_ratio, function(name) {get_power_check_results(data, name, method_names_ratio)}) %>% 
  bind_rows()

# Varying quantile threshold
dir_names_q = c("quantiles/q0.01", "quantiles/q0.025", "quantiles/q0.05", "quantiles/q0.075", "quantiles/q0.1",
              "quantiles/q0.2", "quantiles/q0.3", "quantiles/q0.4", "quantiles/q0.5")
method_names_q <- data.frame('method' = c(1, 2.5, 5, 7.5, 10, 20, 30, 40, 50), 
                           'dir_name' = dir_names_q)
power_check_quantiles <- lapply(dir_names_q, function(name) {get_power_check_results(data, name, method_names_q)}) %>% 
  bind_rows()

1.4 Load calibration results for various guide calling approaches

get_calibration_results <- function(data, name, method_names){
  method_name = method_names[method_names$dir_name == name, 'method']
  df = readRDS(paste0(data_dir, 'sceptre_pipeline/', data, '/discovery_analysis/', name, 
                           '/results_run_calibration_check.rds')) %>%
    filter(pass_qc == TRUE) %>%
    mutate(method = method_name)
  df <- df %>% mutate(p_adj = p.adjust(df$p_value, method="BH"))
  return(df)
}
# Varying UMI threshold
calibration_UMI <- lapply(dir_names_UMI, function(name) {get_calibration_results(data, name, method_names_UMI)}) %>% 
  bind_rows() %>%
  group_by(method) %>%
  mutate(total_tests = n()) %>%
  filter(significant == TRUE) %>%
  group_by(method, total_tests) %>%
  summarize(false_discoveries = n()) %>% 
  right_join(method_names_UMI) %>% 
  replace(is.na(.), 0)
## `summarise()` has grouped output by 'method'. You can override using the
## `.groups` argument.
## Joining with `by = join_by(method)`
# Varying ratio threshold
calibration_ratios <- lapply(dir_names_ratio, function(name) {get_calibration_results(data, name, method_names_ratio)}) %>% 
  bind_rows() %>%
  group_by(method) %>%
  mutate(total_tests = n()) %>%
  filter(significant == TRUE) %>%
  group_by(method, total_tests) %>%
  summarize(false_discoveries = n())  %>% 
  right_join(method_names_ratio) %>% 
  replace(is.na(.), 0)
## `summarise()` has grouped output by 'method'. You can override using the
## `.groups` argument.
## Joining with `by = join_by(method)`
# Varying quantile threshold
calibration_quantiles <- lapply(dir_names_q, function(name) {get_calibration_results(data, name, method_names_q)}) %>% 
  bind_rows() %>%
  group_by(method) %>%
  mutate(total_tests = n()) %>%
  filter(significant == TRUE) %>%
  group_by(method, total_tests) %>%
  summarize(false_discoveries = n())  %>% 
  right_join(method_names_q) %>% 
  replace(is.na(.), 0)
## `summarise()` has grouped output by 'method'. You can override using the
## `.groups` argument.
## Joining with `by = join_by(method)`

1.5 Load discovery analysis results for various guide calling approaches

get_discovery_analysis_results <- function(data, name, method_names){
  method_name = method_names[method_names$dir_name == name, 'method']
  df <- readRDS(paste0(data_dir, 'sceptre_pipeline/', data, '/discovery_analysis/', name, 
                           '/results_run_discovery_analysis.rds')) %>%
    filter(pass_qc == TRUE, significant == TRUE) %>%
    mutate(method = method_name)
}
# varying UMI threshold
discovery_analysis_UMI <- lapply(dir_names_UMI, function(name) {get_discovery_analysis_results(data, name, method_names_UMI)}) %>% 
  bind_rows()

# varying ratio threshold
discovery_analysis_ratios <- lapply(dir_names_ratio, function(name) {get_discovery_analysis_results(data, name, method_names_ratio)}) %>% 
  bind_rows()

# varying quantile threshold
discovery_analysis_q <- lapply(dir_names_q, function(name) {get_discovery_analysis_results(data, name, method_names_q)}) %>% 
  bind_rows()

2 Number of assignments

2.1 Varying UMI threshold

u1 <- ggplot(n_assigned_cells_UMI, aes(x = method, y = n_cells)) + 
  geom_vline(xintercept = UMI_t, linetype="dashed") +
  geom_point(color =  '#3776ab', size = 2, shape = 4) +
  geom_line(color =  '#3776ab') +
  theme_bw() + 
  labs(x = 'UMI threshold', y = 'Total number of assigned cells') 
u1

u2 <- ggplot(n_assigned_cells_UMI_single, aes(x = method, y = n_cells)) + 
  geom_vline(xintercept = UMI_t, linetype="dashed") +
  geom_point(color =  '#3776ab', size = 2) +
  geom_line(color =  '#3776ab') +
  theme_bw() + 
  labs(x = 'UMI threshold', y = 'Number of uniquely assigned cells')
u2

2.2 Varying ratio threshold

r1 <- ggplot(n_assigned_cells_ratios, aes(x = method, y = n_cells)) + 
  geom_vline(xintercept = ratio_t, linetype="dashed") +
  geom_point(color =  '#3776ab', size = 2, shape = 4) +
  geom_line(color =  '#3776ab') +
  theme_bw() + 
  labs(x = 'Ratio threshold', y = 'Total number of assigned cells')
r1

r2 <- ggplot(n_assigned_cells_ratios_single, aes(x = method, y = n_cells)) + 
  geom_vline(xintercept = ratio_t, linetype="dashed") +
  geom_point(color =  '#3776ab', size = 2) +
  geom_line(color =  '#3776ab') +
  theme_bw() + 
  labs(x = 'Ratio threshold', y = 'Number of uniquely assigned cells')
r2

2.3 Varying quantile threshold

q1 <- ggplot(n_assigned_cells_q, aes(x = method, y = n_cells)) + 
  geom_vline(xintercept = quantile_t, linetype="dashed") +
  geom_point(color =  '#3776ab', size = 2, shape = 4) +
  geom_line(color =  '#3776ab') +
  theme_bw() + 
  labs(x = 'Quantile threshold', y = 'Total number of assigned cells')
q1

q2 <- ggplot(n_assigned_cells_q_single, aes(x = method, y = n_cells)) + 
  geom_vline(xintercept = quantile_t, linetype="dashed") +
  geom_point(color =  '#3776ab', size = 2) +
  geom_line(color =  '#3776ab') +
  theme_bw() + 
  labs(x = 'Quantile threshold', y = 'Number of uniquely assigned cells')
q2

n_assigned_cells <- n_assigned_cells_UMI %>% 
  mutate(assignment = 'UMI_t') %>%
  bind_rows(n_assigned_cells_ratios %>% mutate(assignment = 'ratio_X%')) %>%
  bind_rows(n_assigned_cells_q %>% mutate(assignment = "top_X%_cells"))

row1 <- ggplot(n_assigned_cells, aes(x = method, y = n_cells)) + 
  facet_wrap(~assignment, nrow = 1, scales = "free_x") +
  geom_point(color =  '#3776ab', size = 2, shape = 4) +
  geom_line(color =  '#3776ab') +
  geom_vline(data = all_t, aes(xintercept = threshold), linetype="dashed") +
  theme_bw() + 
  labs(x = 'Threshold', y = 'Total number of assigned cells')
n_single_assigned_cells <- n_assigned_cells_UMI_single %>% 
  mutate(assignment = 'UMI_t') %>%
  bind_rows(n_assigned_cells_ratios_single %>% mutate(assignment = 'ratio_X%')) %>%
  bind_rows(n_assigned_cells_q_single %>% mutate(assignment = "top_X%_cells"))

row2 <- ggplot(n_single_assigned_cells, aes(x = method, y = n_cells)) + 
  facet_wrap(~assignment, nrow = 1, scales = "free_x") +
  geom_point(color =  '#3776ab', size = 2) +
  geom_line(color =  '#3776ab') +
  geom_vline(data = all_t, aes(xintercept = threshold), linetype="dashed") +
  theme_bw() + 
  labs(x = 'Threshold', y = 'Number of uniquely\nassigned cells')

3 Analysis of true positives (power check on target genes)

3.1 Varying UMI threshold

# Number of significantly downregulated target genes
signif_targets_UMI <- power_check_UMI %>% filter(p_adj < 0.05) %>% 
  group_by(method) %>% 
  summarize(signif_targets = n()) 

u3 <- ggplot(signif_targets_UMI, aes(x = method, y = signif_targets)) +
  geom_vline(xintercept = UMI_t, linetype="dashed") +
  geom_point(color =  '#3776ab', size = 2) +
  geom_line(color =  '#3776ab') +
  theme_bw() + 
  labs(x = 'UMI threshold', y = 'Number of significant target genes')
u3

# Median lfc over targets that pass qc for all thresholds
all_targets <- group_by(power_check_UMI, grna_target) %>% summarize(pass_QC = n()) %>% filter(pass_QC == 16)
lfc_UMI <- filter(power_check_UMI, grna_target %in% all_targets$grna_target) %>%
  group_by(method) %>%
  summarize(median_lfc = median(-log_2_fold_change))

u4 <- ggplot(lfc_UMI, aes(x = method, y = median_lfc)) +
  geom_vline(xintercept = UMI_t, linetype="dashed") +
  geom_point(color =  '#3776ab', size = 2) +
  geom_line(color =  '#3776ab') +
  theme_bw() + 
  labs(x = 'UMI threshold', y = 'Median negative log2 fold\nchange of the target gene')
u4

3.2 Varying ratio threshold

# Number of significantly downregulated target genes
signif_targets_ratios <- power_check_ratios %>% filter(p_adj < 0.05) %>% 
  group_by(method) %>% 
  summarize(signif_targets = n()) 

r3 <- ggplot(signif_targets_ratios, aes(x = method, y = signif_targets)) +
  geom_vline(xintercept = ratio_t, linetype="dashed") +
  geom_point(color =  '#3776ab', size = 2) +
  geom_line(color =  '#3776ab') +
  theme_bw() + 
  labs(x = 'Ratio threshold', y = 'Number of significant target genes')
r3

# Median lfc over targets that pass qc for all thresholds
all_targets <- group_by(power_check_ratios, grna_target) %>% summarize(pass_QC = n()) %>% filter(pass_QC == 9)
lfc_ratios <- filter(power_check_ratios, grna_target %in% all_targets$grna_target) %>%
  group_by(method) %>%
  summarize(median_lfc = median(-log_2_fold_change))

r4 <- ggplot(lfc_ratios, aes(x = method, y = median_lfc)) +
  geom_vline(xintercept = ratio_t, linetype="dashed") +
  geom_point(color =  '#3776ab', size = 2) +
  geom_line(color =  '#3776ab') +
  theme_bw() + 
  labs(x = 'Ratio threshold', y = 'Median negative log2 fold\nchange of the target gene')
r4

3.3 Varying quantile threshold

# Number of significantly downregulated target genes
signif_targets_quantiles <- power_check_quantiles %>% filter(p_adj < 0.05) %>% 
  group_by(method) %>% 
  summarize(signif_targets = n()) 

q3 <- ggplot(signif_targets_quantiles, aes(x = method, y = signif_targets)) +
  geom_vline(xintercept = quantile_t, linetype="dashed") +
  geom_point(color =  '#3776ab', size = 2) +
  geom_line(color =  '#3776ab') +
  theme_bw() + 
  labs(x = 'Quantile threshold', y = 'Number of significant target genes')
q3

# Median lfc over targets that pass qc for all thresholds
all_targets <- group_by(power_check_quantiles, grna_target) %>% summarize(pass_QC = n()) %>% filter(pass_QC == 9)
lfc_q <- filter(power_check_quantiles, grna_target %in% all_targets$grna_target) %>%
  group_by(method) %>%
  summarize(median_lfc = median(-log_2_fold_change, na.rm = TRUE))

q4 <- ggplot(lfc_q, aes(x = method, y = median_lfc)) +
  geom_vline(xintercept = quantile_t, linetype="dashed") +
  geom_point(color =  '#3776ab', size = 2) +
  geom_line(color =  '#3776ab') +
  theme_bw() + 
  labs(x = 'Quantile threshold', y = 'Median negative log2 fold\nchange of the target gene')
q4

signif_targets <- signif_targets_UMI %>% 
  mutate(assignment = 'UMI_t') %>%
  bind_rows(signif_targets_ratios %>% mutate(assignment = 'ratio_X%')) %>%
  bind_rows(signif_targets_quantiles %>% mutate(assignment = "top_X%_cells"))

row3 <- ggplot(signif_targets, aes(x = method, y = signif_targets)) +
  facet_wrap(~assignment, nrow = 1, scales = "free_x") +
  geom_point(color =  '#3776ab', size = 2) +
  geom_line(color =  '#3776ab') +
  geom_vline(data = all_t, aes(xintercept = threshold), linetype="dashed") +
  theme_bw() + 
  labs(x = 'Threshold', y = 'Number of significant\ntarget genes')
lfc <- lfc_UMI %>% 
  mutate(assignment = 'UMI_t') %>%
  bind_rows(lfc_ratios %>% mutate(assignment = 'ratio_X%')) %>%
  bind_rows(lfc_q %>% mutate(assignment = "top_X%_cells"))

row4 <- ggplot(lfc, aes(x = method, y = median_lfc)) +
  facet_wrap(~assignment, nrow = 1, scales = "free_x") +
  geom_vline(data = all_t, aes(xintercept = threshold), linetype="dashed") +
  geom_point(color =  '#3776ab', size = 2) +
  geom_line(color =  '#3776ab') +
  theme_bw() + 
  labs(x = 'Threshold', y = 'Median negative log2 fold\nchange of the target gene')

4 Discovery analysis

4.1 Varying UMI threshold

# Get the number of response genes per perturbation
n_discovery_UMI <- discovery_analysis_UMI %>% 
  group_by(method) %>%
  summarize(n_response_genes = n())

u5 <- ggplot(n_discovery_UMI, aes(x = method, y = n_response_genes)) + 
  geom_vline(xintercept = UMI_t, linetype="dashed") +
  geom_point(color =  '#3776ab', size = 2) +
  geom_line(color =  '#3776ab') +
  theme_bw() + 
  labs(x = 'UMI threshold', y = 'Number of total discoveries')
u5

u6 <- ggplot(calibration_UMI, aes(x = factor(method), y = false_discoveries)) +
  geom_bar(stat = 'identity', fill = '#3776ab') + 
  theme_bw() + 
  labs(x = 'UMI threshold', y = paste0('Number of false discoveries'))
u6

4.2 Varying ratio threshold

# Get the number of response genes per perturbation
n_discovery_ratios <- discovery_analysis_ratios %>% 
  group_by(method) %>%
  summarize(n_response_genes = n())

r5 <- ggplot(n_discovery_ratios, aes(x = method, y = n_response_genes)) + 
  geom_vline(xintercept = ratio_t, linetype="dashed") +
  geom_point(color =  '#3776ab', size = 2) +
  geom_line(color =  '#3776ab') +
  theme_bw() + 
  labs(x = 'Ratio threshold', y = 'Number of total discoveries')
r5

r6 <- ggplot(calibration_ratios, aes(x = factor(method), y = false_discoveries)) +
  geom_bar(stat = 'identity', fill = '#3776ab') + 
  theme_bw() + 
  labs(x = 'Ratio threshold', y = paste0('Number of false discoveries'))
r6

4.3 Varying quantile threshold

# Get the number of response genes per perturbation
n_discovery_q <- discovery_analysis_q %>% 
  group_by(method) %>%
  summarize(n_response_genes = n())

q5 <- ggplot(n_discovery_q, aes(x = method, y = n_response_genes)) + 
  geom_vline(xintercept = quantile_t, linetype="dashed") +
  geom_point(color =  '#3776ab', size = 2) +
  geom_line(color =  '#3776ab') +
  theme_bw() + 
  labs(x = 'Quantile threshold', y = 'Number of total discoveries')
q5

q6 <- ggplot(calibration_quantiles, aes(x = factor(method), y = false_discoveries)) +
  geom_bar(stat = 'identity', fill = '#3776ab') + 
  theme_bw() + 
  labs(x = 'Quantile threshold', y = paste0('Number of false discoveries'))
q6

n_discovery <- n_discovery_UMI %>% 
  mutate(assignment = 'UMI_t') %>%
  bind_rows(n_discovery_ratios %>% mutate(assignment = 'ratio_X%')) %>%
  bind_rows(n_discovery_q %>% mutate(assignment = "top_X%_cells"))

row5 <- ggplot(n_discovery, aes(x = method, y = n_response_genes)) + 
  facet_wrap(~assignment, nrow = 1, scales = "free_x") +
  geom_vline(data = all_t, aes(xintercept = threshold), linetype="dashed") +
  geom_point(color =  '#3776ab', size = 2) +
  geom_line(color =  '#3776ab') +
  theme_bw() + 
  labs(x = 'Threshold', y = 'Number of discoveries')
calibration <- calibration_UMI %>% 
  mutate(assignment = 'UMI_t') %>%
  bind_rows(calibration_ratios %>% mutate(assignment = 'ratio_X%')) %>%
  bind_rows(calibration_quantiles %>% mutate(assignment = "top_X%_cells"))

row6 <- ggplot(calibration, aes(x = factor(method), y = false_discoveries)) +
  facet_wrap(~assignment, nrow = 1, scales = "free_x") +
  geom_bar(stat = 'identity', fill = '#3776ab') + 
  theme_bw() + 
  labs(x = 'Threshold', y = paste0('Number of false discoveries'))
plot_grid(row1, row2, row4, row3, row5, row6, ncol = 1, labels = "AUTO")

plot_grid(u1, u2, u4, u3, u5, u6, r1, r2, r4, r3, r5, r6, q1, q2, q4, q3, q5, q6,
          labels = c("A(i)", "(ii)", "(iii)", "(iv)", "(v)", "(vi)",
                     "B(i)", "(ii)", "(iii)", "(iv)", "(v)", "(vi)",
                     "C(i)", "(ii)", "(iii)", "(iv)", "(v)", "(vi)"), 
          ncol = 6)

plot_grid(u1, r1, q1, u2, r2, q2, u4, r4, q4, u3, r3, q3, u5, r5, q5, u6, r6, q6,
          labels = c("A(i)", "(ii)", "(iii)", 
                     "B(i)", "(ii)", "(iii)",  
                     "C(i)", "(ii)", "(iii)",
                     "D(i)", "(ii)", "(iii)", 
                     "E(i)", "(ii)", "(iii)",
                     "F(i)", "(ii)", "(iii)"), 
          ncol = 3)

5 Session info

sessionInfo()
## R version 4.3.2 (2023-10-31)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Sonoma 14.2.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Europe/Berlin
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] cowplot_1.1.3    pheatmap_1.0.12  UpSetR_1.4.0     GGally_2.2.1    
##  [5] lubridate_1.9.3  forcats_1.0.0    stringr_1.5.1    dplyr_1.1.4     
##  [9] purrr_1.0.2      readr_2.1.5      tidyr_1.3.1      tibble_3.2.1    
## [13] ggplot2_3.5.1    tidyverse_2.0.0  BiocStyle_2.30.0
## 
## loaded via a namespace (and not attached):
##  [1] sass_0.4.9          utf8_1.2.4          generics_0.1.3     
##  [4] stringi_1.8.4       hms_1.1.3           digest_0.6.36      
##  [7] magrittr_2.0.3      evaluate_0.24.0     grid_4.3.2         
## [10] timechange_0.3.0    RColorBrewer_1.1-3  bookdown_0.40      
## [13] fastmap_1.2.0       plyr_1.8.9          jsonlite_1.8.8     
## [16] tinytex_0.52        gridExtra_2.3       BiocManager_1.30.23
## [19] fansi_1.0.6         scales_1.3.0        jquerylib_0.1.4    
## [22] cli_3.6.3           crayon_1.5.3        rlang_1.1.4        
## [25] bit64_4.0.5         munsell_0.5.1       withr_3.0.0        
## [28] cachem_1.1.0        yaml_2.3.9          parallel_4.3.2     
## [31] tools_4.3.2         tzdb_0.4.0          colorspace_2.1-0   
## [34] ggstats_0.6.0       vctrs_0.6.5         R6_2.5.1           
## [37] lifecycle_1.0.4     bit_4.0.5           vroom_1.6.5        
## [40] pkgconfig_2.0.3     pillar_1.9.0        bslib_0.7.0        
## [43] gtable_0.3.5        data.table_1.15.4   Rcpp_1.0.13        
## [46] glue_1.7.0          highr_0.11          xfun_0.46          
## [49] tidyselect_1.2.1    rstudioapi_0.16.0   knitr_1.48         
## [52] farver_2.1.2        htmltools_0.5.8.1   labeling_0.4.3     
## [55] rmarkdown_2.27      compiler_4.3.2